---
title: "Working on Data Aggregated from Python"
author: "Zachary Steinert-Threlkeld"
output:
  pdf_document:
    number_sections: yes
    toc: yes
    toc_depth: 4
---


# Setup
```{r setup, include=TRUE, cache=TRUE}
#knitr::opts_chunk$set(echo = TRUE)
#knitr::opts_knit$set(root.dir ='<path/to/Replication/>')
#library(knitr)
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=80),tidy=TRUE)
library(stargazer)
library(tidyr)  # For complete function
library(reshape2)  # For melt
library(dplyr) # For aggregation
```

```{r}
#setwd('<path/to/Replication/>')
```


# Make Functions
These functions add the number of people who take protest photos as well as the number of protest photos, and they also add missing days; one function (`addMissingDays`) does both of those.  The next function (`redoLags`) recomputes the lags at the correct level of aggregation and then labels the days of the week.
```{r}
# Split Venezuela ('VE') into two pseudo-countries by time period.
#
# Rows whose country code is 'VE' are relabelled 'VE1' when their `day`
# contains '2014' or '2015', and 'VE2' otherwise.  All other rows are left
# untouched.
#
# Args:
#   data: data.frame with columns `place.country_code` and `day`.
#
# Returns:
#   `data` with `place.country_code` coerced to character and the 'VE' rows
#   relabelled.  A frame without any 'VE' rows is returned unchanged (apart
#   from the character coercion).
splitVE <- function(data){
  data$place.country_code <- as.character(data$place.country_code)  # To make sure everything is a string

  # which() drops NA comparisons, so rows with a missing country code are
  # skipped instead of breaking the assignment below.
  ve_rows <- which(data$place.country_code == 'VE')

  # Guard the empty case: with zero 'VE' rows there is nothing to relabel.
  if(length(ve_rows) > 0){
    period <- ifelse(grepl('2014|2015', data$day[ve_rows]), 1, 2)  # 2 periods, easy to handle with ifelse
    data$place.country_code[ve_rows] <- paste0('VE', period)  # Country name now carries the period indicator
  }

  return(data)
}

# Read an aggregated panel and its protester/photo counts, merge them, and add
# a row for every calendar day that is missing within each unit.
#
# Args:
#   data_path:  path to the aggregated CSV; must contain `place.country_code`,
#               `day`, and the unit columns implied by `agg_unit`.
#   count_path: path to the CSV of user and tweet counts at the same level of
#               aggregation.  Its `user.id`, `user.id_lag`, `tweets` and
#               `tweets_lag` columns are renamed to `Protesters`,
#               `Protesters_lag`, `Protest_photos` and `Protest_photos_lag`.
#   agg_unit:   'countryday', 'stateday' or 'cityday'.
#
# Returns:
#   A data.frame with one row per unit per day between that unit's first and
#   last observed day.  In rows added for missing days, every column whose name
#   does not contain '_lag' is set to 0; '_lag' columns stay NA because a lag
#   must be the previous day's value, not 0.  An unrecognized `agg_unit` raises
#   a warning and returns the data after the Venezuela split only.
addMissingDays <- function(data_path, count_path, agg_unit='countryday'){
  ## Open data
  data <- read.csv(data_path, stringsAsFactors=FALSE)

  ## Add the Venezuela periods. Necessary so don't get so many empty days.
  data <- splitVE(data)

  ## Columns that identify one unit at each level of aggregation
  unit_cols <- switch(agg_unit,
                      countryday = 'place.country_code',
                      stateday = c('place.country_code', 'rg.state'),
                      cityday = c('place.country_code', 'rg.state', 'city_use'),
                      NULL)
  if(is.null(unit_cols)){
    warning("Unrecognized agg_unit '", agg_unit,
            "'; counts were not merged and missing days were not added.",
            call.=FALSE)
    return(data)
  }

  ## Number of Unique Users Based on Images
  thecount <- read.csv(count_path, stringsAsFactors=FALSE)
  new_names <- c('user.id'='Protesters',
                 'user.id_lag'='Protesters_lag',
                 'tweets'='Protest_photos',
                 'tweets_lag'='Protest_photos_lag')
  to_rename <- names(thecount) %in% names(new_names)
  names(thecount)[to_rename] <- unname(new_names[names(thecount)[to_rename]])

  thecount <- splitVE(data=thecount)

  ## Keep only unit-days present in both files (merge is an inner join)
  data <- merge(data, thecount, by=c(unit_cols, 'day'))

  ## Add missing days: within each unit, every day from its first to its last
  temp <- as.data.frame(
    data %>%
      group_by_at(unit_cols) %>%
      mutate(day = as.Date(day)) %>%
      complete(day=seq.Date(min(day), max(day), by='day'))
  )

  # Replace only NA for variables that aren't lag.
  # Replacing lag with 0 would be wrong because lag needs to be previous value.
  tempa <- temp[grep('_lag', names(temp), invert=TRUE)]
  tempa[is.na(tempa)] <- 0

  # Merge back in
  temp[names(tempa)] <- tempa

  return(temp)
}
```

Lags now need to be updated.  Before, lags were for previous observations, which was not necessarily the previous day.  Now, the lag will be the value from the previous day, including when the previous day was just added as missing data.

```{r}
# Recompute every '_lag' column as the previous row's value of its source
# column within each unit, then add the weekday name.
#
# Args:
#   data:     data.frame with `place.country_code`, `day`, and (for state/city
#             units) `rg.state` / `city_use`.  Rows of each unit are assumed to
#             be in chronological order with no missing days, which is what
#             addMissingDays() produces.
#   agg_unit: 'countryday', 'stateday' or 'cityday'.
#
# Returns:
#   `data` where each column whose name contains '_lag' holds the one-row lag
#   of the column named by removing '_lag' (NA on the first row of each unit).
#   For 'stateday' and 'cityday' a `key` column identifying the unit is added.
#   A `dayOfWeek` column with the full weekday name of `day` is always added.
#   An unrecognized `agg_unit` raises a warning and leaves the lags untouched.
redoLags <- function(data, agg_unit='countryday'){
  ## Identify which rows belong to the same unit
  known_unit <- TRUE
  if(agg_unit == 'countryday'){
    unit_id <- data$place.country_code
  } else if(agg_unit == 'stateday'){
    data$key <- paste0(data$rg.state, data$place.country_code)
    unit_id <- data$key
  } else if(agg_unit == 'cityday'){
    data$key <- paste0(data$city_use, data$rg.state, data$place.country_code)
    unit_id <- data$key
  } else {
    known_unit <- FALSE
    warning("Unrecognized agg_unit '", agg_unit, "'; lags were not recomputed.",
            call.=FALSE)
  }

  if(known_unit){
    ## previous_row[r] is the row directly before r in the same unit, or NA
    ## when r is the unit's first row.  match() keeps NA codes as a unit.
    rows_by_unit <- split(seq_len(nrow(data)), match(unit_id, unique(unit_id)))
    previous_row <- rep(NA_integer_, nrow(data))
    for(rows in rows_by_unit){
      previous_row[rows] <- c(NA_integer_, rows[-length(rows)])
    }

    ## Lag the original (non-lag) column, not the old lag, since the old lag
    ## is NA on the days that were added as missing.
    for(lag_col in grep('_lag', names(data), value=TRUE)){
      source_col <- gsub('_lag', '', lag_col)
      if(source_col %in% names(data)){
        # Indexing by previous_row shifts values down one row per unit without
        # relying on which package's lag() is currently attached.
        data[[lag_col]] <- data[[source_col]][previous_row]
      } else {
        warning("No column '", source_col, "' to rebuild '", lag_col,
                "'; left unchanged.", call.=FALSE)
      }
    }
  }

  ## Add day of week string, for later fixed effects
  data$dayOfWeek <- weekdays(as.Date(data$day), abbreviate=FALSE)

  return(data)
}
```

## Take Logarithm of n_face, n_face_lag
```{r}
# Add the face-count aliases and their base-10 log(x + 1) transforms.
#
# Args:
#   data: data.frame with columns `faces_Sum`, `faces_Sum_lag`,
#         `totalFaces_Sum` and `totalFaces_Sum_lag`.
#
# Returns:
#   `data` with six extra columns: `n_face` and `n_face_lag` (copies of the
#   `faces_Sum` columns, kept so the regression script's names still match),
#   plus `n_face_log`, `n_face_lag_log`, `totalFaces_log` and
#   `totalFaces_lag_log`.
logDV <- function(data){
  # Adding 1 before taking the log keeps zero counts finite (log10(1) = 0).
  log10_plus_one <- function(counts){
    log(counts + 1, 10)
  }

  # Vestigial aliases: the regression script refers to n_face / n_face_lag.
  data$n_face <- data$faces_Sum
  data$n_face_lag <- data$faces_Sum_lag

  data$n_face_log <- log10_plus_one(data$faces_Sum)
  data$n_face_lag_log <- log10_plus_one(data$faces_Sum_lag)

  data$totalFaces_log <- log10_plus_one(data$totalFaces_Sum)
  data$totalFaces_lag_log <- log10_plus_one(data$totalFaces_Sum_lag)

  data
}
  
```

# Full data
## Process countryday
```{r}
# Full sample, country-day: add counts and missing days, recompute lags per
# country, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets.csv',
  agg_unit='countryday'
) %>%
  redoLags(agg_unit='countryday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays.csv')
```

## Process stateday
```{r}
# Full sample, state-day: add counts and missing days, recompute lags per
# state, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets.csv',
  agg_unit='stateday'
) %>%
  redoLags(agg_unit='stateday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays.csv')
```


## Process cityday
```{r}
# Full sample, city-day: add counts and missing days, recompute lags per
# city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays.csv')
```


# Process robustness checks

## Verified Users
### Country
```{r}
# Verified accounts, country-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by country and no `key` column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_verifiedAccounts.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_verifiedAccounts.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_verifiedAccounts.csv')
```

### State
```{r}
# Verified accounts, state-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_verifiedAccounts.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_verifiedAccounts.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_verifiedAccounts.csv')
```

### City
```{r}
# Verified accounts, city-day: add counts and missing days, recompute lags per
# city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_verifiedAccounts.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_verifiedAccounts.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_verifiedAccounts.csv')
```


## Country Not Inferred
### Country
```{r}
# Country not inferred (goodBB), country-day.  redoLags() gets the same
# agg_unit as addMissingDays() so lags are grouped by country and no `key`
# column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_goodBB.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_goodBB.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_goodBB.csv')
```

### State
```{r}
# Country not inferred (goodBB), state-day.  redoLags() gets the same agg_unit
# as addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_goodBB.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_goodBB.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_goodBB.csv')
```

### City
```{r}
# Country not inferred (goodBB), city-day: add counts and missing days,
# recompute lags per city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_goodBB.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_goodBB.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_goodBB.csv')
```



## Mobile Only
### Country
```{r}
# Mobile tweets only, country-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by country and no `key` column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_mobileTweets.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_mobileTweets.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_mobileTweets.csv')
```

### State
```{r}
# Mobile tweets only, state-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_mobileTweets.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_mobileTweets.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_mobileTweets.csv')
```

### City
```{r}
# Mobile tweets only, city-day: add counts and missing days, recompute lags
# per city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_mobileTweets.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_mobileTweets.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_mobileTweets.csv')
```



## Dominant Language
### Country
```{r}
# Dominant language, country-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by country and no `key` column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_dominantLanguage.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_dominantLanguage.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_dominantLanguage.csv')
```

### State
```{r}
# Dominant language, state-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_dominantLanguage.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_dominantLanguage.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_dominantLanguage.csv')
```

### City
```{r}
# Dominant language, city-day: add counts and missing days, recompute lags per
# city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_dominantLanguage.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_dominantLanguage.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_dominantLanguage.csv')
```



## Narrow Time
### Country
```{r}
# Narrow time window, country-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by country and no `key` column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_timeNarrow.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_timeNarrow.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_timeNarrow.csv')
```

### State
```{r}
# Narrow time window, state-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_timeNarrow.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_timeNarrow.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_timeNarrow.csv')
```

### City
```{r}
# Narrow time window, city-day: add counts and missing days, recompute lags
# per city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_timeNarrow.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_timeNarrow.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_timeNarrow.csv')
```



## User Popularity
### Country
```{r}
# User popularity (2575Popularity), country-day.  redoLags() gets the same
# agg_unit as addMissingDays() so lags are grouped by country and no `key`
# column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_2575Popularity.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_2575Popularity.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_2575Popularity.csv')
```

### State
```{r}
# User popularity (2575Popularity), state-day.  redoLags() gets the same
# agg_unit as addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_2575Popularity.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_2575Popularity.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_2575Popularity.csv')
```

### City
```{r}
# User popularity (2575Popularity), city-day: add counts and missing days,
# recompute lags per city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_2575Popularity.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_2575Popularity.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_2575Popularity.csv')
```


## Not Bot
### Country
```{r}
# Bots excluded (noBot), country-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by country and no `key` column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_noBot.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_noBot.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_noBot.csv')
```

### State
```{r}
# Bots excluded (noBot), state-day.  redoLags() gets the same agg_unit as
# addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_noBot.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_noBot.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_noBot.csv')
```

### City
```{r}
# Bots excluded (noBot), city-day: add counts and missing days, recompute lags
# per city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_noBot.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_noBot.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_noBot.csv')
```



## No Duplicates
### Country
```{r}
# Duplicates removed (noDuplicate), country-day.  redoLags() gets the same
# agg_unit as addMissingDays() so lags are grouped by country and no `key`
# column is added.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_withNewClassifierOutput_noDuplicate.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_countryday_Protesters_Tweets_noDuplicate.csv',
  agg_unit='countryday')
data <- redoLags(data=data, agg_unit='countryday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_countryday_UsersAndMissingDays_noDuplicate.csv')
```

### State
```{r}
# Duplicates removed (noDuplicate), state-day.  redoLags() gets the same
# agg_unit as addMissingDays() so lags are grouped by state within country.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_withNewClassifierOutput_noDuplicate.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_stateday_Protesters_Tweets_noDuplicate.csv',
  agg_unit='stateday')
data <- redoLags(data=data, agg_unit='stateday')
data <- logDV(data)
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_stateday_UsersAndMissingDays_noDuplicate.csv')
```

### City
```{r}
# Duplicates removed (noDuplicate), city-day: add counts and missing days,
# recompute lags per city, add the log variables, and save.
data <- addMissingDays(
  data_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_withNewClassifierOutput_noDuplicate.csv',
  count_path='./Data/02_processedData/d_DonghyeonAlexmerged_cityday_Protesters_Tweets_noDuplicate.csv',
  agg_unit='cityday'
) %>%
  redoLags(agg_unit='cityday') %>%
  logDV()
write.csv(data, './Data/02_processedData/e_DonghyeonAlexmerged_cityday_UsersAndMissingDays_noDuplicate.csv')
```


